import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from sklearn.linear_model import LogisticRegression
# importing ploting libraries
import matplotlib.pyplot as plt
# To enable plotting graphs in Jupyter notebook
%matplotlib inline
#importing seaborn for statistical plots
import seaborn as sns
#Let us break the X and y dataframes into training set and test set. For this we will use
#Sklearn package's data splitting function which is based on random function
from sklearn.model_selection import train_test_split
import numpy as np
from scipy import stats
# calculate accuracy measures and confusion matrix
from sklearn import metrics
# Load the bank marketing data set and take a first look at its
# size, structure, and basic statistics.
df = pd.read_csv('Bank_Personal_Loan_Modelling.csv')
df.head()
df.shape
# Basic info: column dtypes and non-null counts
df.info()
# Summary statistics for every attribute
df.describe().transpose()
# Missing-value checks
df.isnull().sum()
df.isnull().values.any()
# Number of unique values per column
df.nunique()
# How many customers carry no mortgage at all
df[df['Mortgage'] == 0]['Mortgage'].count()
# Frequency tables for the discrete / categorical attributes
# (same columns and order as before, printed in one loop)
for col in ['CreditCard', 'Family', 'Education', 'Online',
            'CD Account', 'Personal Loan', 'Securities Account']:
    print(df[col].value_counts())
# NOTE(review): IPython shell magic — only runs inside Jupyter, and pulls
# in the third-party pandas_profiling package (now renamed ydata-profiling).
!pip install pandas_profiling
import pandas_profiling
# Automated EDA report for the whole dataframe
df.profile_report()
# Count impossible negative values in Experience
df[df['Experience'] < 0]['Experience'].count()
# Pairwise scatter plots of every attribute except the ID column
sns.pairplot(df.iloc[:, 1:])
df.dtypes
# Correlation heatmap across the attributes
plt.figure(figsize=(15, 7))
plt.title('Correlation Attributes', y=1.025, size=15)  # fixed typo "Attribuites"
sns.heatmap(df.corr(), cmap='plasma', annot=True, fmt='0.2f')
No missing values. The minimum value of Experience is -3; experience cannot be negative, so it should be a positive integer or zero. The data is a mix of numeric (Age, ID, Income, Mortgage, etc.), categorical (Family, Education), and Boolean (CD Account, CreditCard, Online, etc.) variables. There is a strong correlation between Age and Experience (coefficient 0.99). Personal Loan correlates positively with Income, average credit-card spending (CCAvg), and Mortgage.
# Plot the distribution of each attribute, one figure per column.
# (The loop body was unindented in the original — a SyntaxError.)
for col in ['Age', 'Experience', 'Income', 'CCAvg', 'Education', 'Mortgage',
            'Online', 'CreditCard', 'Family', 'Personal Loan',
            'Securities Account', 'CD Account']:
    sns.distplot(df[col])  # NOTE(review): distplot is deprecated; histplot/displot is the modern API
    plt.show()
# Share of each category, as proportions
df['Family'].value_counts(normalize=True)
df['Education'].value_counts(normalize=True)
df['Securities Account'].value_counts(normalize=True)
df['CD Account'].value_counts(normalize=True)
df['Online'].value_counts(normalize=True)
df['CreditCard'].value_counts(normalize=True)
df['Personal Loan'].value_counts(normalize=True)
# Box plots to eyeball outliers. The original referenced data_df here,
# but data_df is only defined further down -> NameError. Build the same
# trimmed frame (without ID / Experience / ZIP Code) inline instead.
df.drop(['ID', 'Experience', 'ZIP Code'], axis=1).boxplot(return_type='axes', figsize=(20, 5))
90% did not accept the personal loan. 70% have no credit card. 40% do not use the internet banking system. 30% have one kid in the family. The Mortgage data has a lot of outliers. We can remove the ID, Experience, and ZIP Code columns — they are not really significant in this case.
# Drop identifier-like columns that carry no predictive signal
# (Experience is also almost perfectly correlated with Age).
data_df = df.drop(['ID', 'Experience', 'ZIP Code'], axis=1)
data_df.head()
# Row-normalized cross-tabulations of each categorical attribute
# against the Personal Loan target (same columns and order as before).
for col in ['Family', 'Education', 'Securities Account',
            'CD Account', 'Online', 'CreditCard']:
    pd.crosstab(data_df[col], data_df['Personal Loan'], normalize='index')
# Pie chart of how many customers accepted the personal loan.
loan_counts = pd.DataFrame(df['Personal Loan'].value_counts()).reset_index()
loan_counts.columns = ['Category', 'Personal Loan']
loan_counts
fig1, ax1 = plt.subplots()
# Pull the second (accepted) slice slightly out of the pie
ax1.pie(
    loan_counts['Personal Loan'],
    explode=(0, 0.25),
    labels=loan_counts['Category'],
    autopct='%1.2f%%',
    shadow=True,
    startangle=90,
)
ax1.axis('equal')  # draw as a circle rather than an ellipse
plt.title("% Personal Loan")
plt.show()
# Income vs. family size, split by loan acceptance
sns.catplot(x='Family', y='Income', hue='Personal Loan', data=df, kind='swarm')
# Income and Mortgage distributions per education level, split by loan acceptance
sns.boxplot(x='Education', y='Income', hue='Personal Loan', data=df)
sns.boxplot(x="Education", y='Mortgage', hue="Personal Loan", data=df)
# Family-size counts split by loan acceptance
sns.countplot(x='Family', data=df, hue='Personal Loan')
# Mean credit-card spend and mean income per loan outcome
mean_ccavg = df.groupby('Personal Loan')['CCAvg'].mean()
mean_ccavg.plot(kind='bar')
mean_income = df.groupby('Personal Loan')['Income'].mean()
mean_income.plot(kind='bar')
Customers with high salaries (over 80–100K) accepted the personal loan in this case. Customers with high average credit-card spending (CCAvg) accepted the personal loan. CD account holders accept the personal loan more often.
## Define the predictors (X) and the target (Y)
X = data_df.drop('Personal Loan', axis=1)
# Cast the target to 'category' — it was object-typed, which the model
# does not recognize.
Y = data_df['Personal Loan'].astype('category')
# One-hot encode categorical variables, dropping the first level of each
X = pd.get_dummies(X, drop_first=True)
## Hold out 30% of the rows as the test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.30, random_state=7)
from sklearn.metrics import confusion_matrix, recall_score, precision_score, f1_score, roc_auc_score, accuracy_score
from sklearn.linear_model import LogisticRegression
# Fit a logistic-regression classifier on the training split.
# Fix vs. the original: fit_intercept=False suppressed the bias term for
# no stated reason, which biases the model on these uncentered features —
# restored to the default (True).
logreg = LogisticRegression(random_state=4294967295)
logreg.fit(X_train, y_train)  # fit the model on train data
# Predict the target on the test data. (In the original, the statement
# "logreg_model = LogisticRegression()" had been accidentally fused into
# this line's comment, leaving logreg_model undefined for later cells.)
y_predict = logreg.predict(X_test)
# Put the observed and predicted classes side by side for inspection.
comparison = X_test.copy()
comparison['Observed Personal Loan'] = y_test
comparison['Predicted Personal Loan'] = y_predict
comparison.head()
## Function to display the confusion matrix in a readable format
def draw_cm(actual, predicted):
    """Plot the confusion matrix of *actual* vs. *predicted* labels as an
    annotated heatmap (rows = observed class, columns = predicted class).

    Returns None; the plot is shown as a side effect.
    (The original body was unindented — a SyntaxError — and had a stray
    duplicate comment fused onto the plt.show() line.)
    """
    cm = confusion_matrix(actual, predicted)
    sns.heatmap(cm, annot=True, fmt='.2f', xticklabels=[0, 1], yticklabels=[0, 1])
    plt.ylabel('Observed')
    plt.xlabel('Predicted')
    plt.show()
# Report train/test accuracy plus the main classification metrics.
print("Training accuracy", logreg.score(X_train, y_train))  # fixed typo "Trainig"
print()
print("Testing accuracy", logreg.score(X_test, y_test))
print()
print('Confusion Matrix')
# draw_cm() plots and returns None, so wrapping it in print() just
# emitted a spurious "None" line — call it directly instead.
draw_cm(y_test, y_predict)
print()
print("Recall:", recall_score(y_test, y_predict))
print()
print("Precision:", precision_score(y_test, y_predict))
print()
print("F1 Score:", f1_score(y_test, y_predict))
print()
print("Roc Auc Score:", roc_auc_score(y_test, y_predict))
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc
# Summary report on the test set. The original called
# logreg_model.predict(), but logreg_model was never defined (its
# assignment had been swallowed into a comment) and would have been
# unfitted anyway — use the fitted logreg estimator instead.
y_predict = logreg.predict(X_test)
print(classification_report(y_test, y_predict))
print(accuracy_score(y_test, y_predict))
print(confusion_matrix(y_test, y_predict))
# !pip install yellowbrick
# Additional: ROC curve with its area (AUC) for the fitted model
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
logit_roc_auc = roc_auc_score(y_test, logreg.predict(X_test))
# Curve is drawn from predicted probabilities of the positive class
fpr, tpr, thresholds = roc_curve(y_test, logreg.predict_proba(X_test)[:, 1])
plt.figure()
roc_label = 'Logistic Regression (area = %0.2f)' % logit_roc_auc
plt.plot(fpr, tpr, label=roc_label)
plt.plot([0, 1], [0, 1], 'r--')  # chance-level diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()
## Feature importance: absolute logistic-regression coefficient per column
fi = pd.DataFrame({
    'Col': X_train.columns,
    'Coeff': np.round(np.abs(logreg.coef_[0]), 2),
})
fi.sort_values(by='Coeff', ascending=False)
Logistic regression is used for binary output prediction, with Personal Loan as the dependent variable; the data is split 70–30 into training and test sets. Accuracy (training and testing) is about 91%, but this is misleading: the large majority of customers are non-buyers compared with the small percentage of personal-loan buyers, so the accuracy of this model is debatable. A recall of 61% means the model did reasonably well at predicting positives, but the AUC (area under the curve) of 67% is alarming. Filtering the data — for example, removing outliers in Mortgage and Income — might increase accuracy, recall, and AUC, but that may be overdoing it, since we already know that higher-salary customers buy the personal-loan product. The same applies to scaling the attributes as a next step, or to using other models to confirm the trends. For marketing the personal loan, the product clearly needs to be made more attractive to low- and medium-income customers, as those customers are not on the buyer list; a decision-tree model could also be tried to improve results.